Keith H. Turner (Twitter: @kay_aych)
Friday, May 12, 2017
ggplot and modeling
These tutorials are largely based on the book R for Data Science, which is also available for free online at https://r4ds.had.co.nz
ggplot and modeling
Much of today is based on based on the ggplot tutorial at the Harvard University Institute for Quantitative Social Science

data = mtcars)aes())geom_*())stat_*())scale_*())facet_*())ggplot(data, aes()) + geom_point() + coord_equal() + scale_x_log10() + theme_bw()# Run this code!
library(tidyverse)
housing <- read_csv("data/landdata-states.csv")
housing
# A tibble: 7,803 × 11
State region Date Home.Value Structure.Cost Land.Value
<chr> <chr> <dbl> <int> <int> <int>
1 AK West 2010.25 224952 160599 64352
2 AK West 2010.50 225511 160252 65259
3 AK West 2009.75 225820 163791 62029
4 AK West 2010.00 224994 161787 63207
5 AK West 2008.00 234590 155400 79190
6 AK West 2008.25 233714 157458 76256
7 AK West 2008.50 232999 160092 72906
8 AK West 2008.75 232164 162704 69460
9 AK West 2009.00 231039 164739 66299
10 AK West 2009.25 229395 165424 63971
# ... with 7,793 more rows, and 5 more variables: Land.Share..Pct. <dbl>,
# Home.Price.Index <dbl>, Land.Price.Index <dbl>, Year <int>, Qrtr <int>
hist(housing$Home.Value)
ggplot(housing, aes(x = Home.Value)) + geom_histogram()
plot(Home.Value ~ Date, data=subset(housing, State == "MA"))
points(Home.Value ~ Date, col="red", data=subset(housing, State == "TX"))
legend(1975, 400000,
c("MA", "TX"), title="State",
col=c("black", "red"),
pch=c(1, 1))
housing %>%
filter(State %in% c("MA", "TX")) %>%
ggplot(aes(Date, Home.Value, color = State)) +
geom_point()
data = mtcars)aes())geom_*())stat_*())scale_*())coord_*())facet_*())theme_*())aes(Year, Num_Cases, # x and y are default 1st and 2nd, all others must be named
color = Patient_Sex, # "outside" color of point, bar, shape, etc.
fill = Disease_Name, # "inside" color
alpha = Seriousness, # transparency
shape = Age_Group, # for points
size = Population_Share
# And MANY more, x/y min/max (for polygons), linetype (for lines), etc.
)
* x -> Year
* y -> Num_Cases
* colour -> Patient_Sex
* fill -> Disease_Name
* alpha -> Seriousness
* shape -> Age_Group
* size -> Population_Share
fontface?)
## One variable
geom_histogram()
## Two variables
# Continuous X, continuous Y
geom_point()
geom_smooth()
# Discrete X, continuous Y
geom_boxplot()
geom_violin()
# Continuous bivariate distribution
geom_bin2d()
From the Rstudio ggplot2 cheat sheet
ggplot(aes()) if not specifiedggplot(housing, aes(Date, Home.Value)) +
geom_point(aes(color = region))
x, y
color, fill, alpha, size, shapehousing %>% filter(Year == 2008) %>%
# Variables can be transformed in aes() calls
ggplot(aes(log10(Land.Value), Structure.Cost, color = Qrtr)) + geom_point()
x, y, label
color, alpha, size, fontface, …housing %>% filter(Year == 2008) %>%
# Variables can be transformed in aes() calls
ggplot(aes(log10(Land.Value), Structure.Cost, color = Qrtr, label = State)) + geom_text()
geom to a plothousing %>% filter(Year == 2008, Qrtr == 1) %>%
ggplot(aes(log10(Land.Value), Structure.Cost)) +
geom_point() +
geom_smooth()
housing %>% filter(Year == 2008, Qrtr == 1) %>%
ggplot(aes(log10(Land.Value), Structure.Cost, color = region)) +
geom_point() +
geom_smooth()
housing %>% filter(Year == 2008, Qrtr == 1) %>%
ggplot(aes(log10(Land.Value), Structure.Cost)) +
geom_point(aes(color = region)) +
geom_smooth()
aes(): map variable to aestheticsaes(): fixed aestheticshousing %>% filter(Year == 2008, Qrtr == 1) %>%
ggplot(aes(log10(Land.Value), Structure.Cost)) +
geom_point(aes(size = 2), color = "red")
Midwest regionhousing %>%
filter(region == "Midwest") %>%
ggplot(aes(Date, Home.Price.Index)) +
geom_point(aes(color = State)) +
geom_smooth()
data = mtcars)aes())geom_*())stat_*())scale_*())coord_*())facet_*())theme_*())geom_boxplot, geom_histogram, geom_smooth, etc., require transformations
stat_bin, stat_count, etc.)geoms are passed to the default statggplot(housing, aes(Home.Value)) + geom_histogram()
ggplot(housing, aes(Home.Value)) + geom_histogram(binwidth = 5000)
housing %>%
group_by(Year, region) %>%
summarize(mean_hpi = mean(Home.Price.Index)) %>%
ggplot(aes(Year, mean_hpi, fill = region)) +
geom_bar()
Error: stat_count() must not be used with a y aesthetic.
geom_bar is stat_count
y value itselfhousing %>%
group_by(Year, region) %>%
summarize(mean_hpi = mean(Home.Price.Index)) %>%
ggplot(aes(Year, mean_hpi, fill = region)) +
geom_bar(stat = "identity", position = "dodge")
geom_smooth onto Structure.Cost, one for each regiongeom_smooth onto Structure.Cost, one for each regionhousing %>%
ggplot(aes(Date, Structure.Cost, color = region)) +
geom_smooth(method = "lm")
data = mtcars)aes())geom_*())stat_*())scale_*())coord_*())facet_*())theme_*())scale_<aesthetic>_<type>
scale_x_log10()scale_color_discrete()scale_alpha_manual()scale_fill_brewer() - Color Brewer palettesnamelimitsbreaks - Where should the ticks appearlabels - The labels that appear at each breakp
p <- housing %>%
ggplot(aes(State, Home.Price.Index, color = Date)) +
geom_point(alpha = 0.5, position = position_jitter(width = 0.25, height = 0)) +
theme(legend.position = "top")
p
p <- p + scale_x_discrete("State Abbreviation") # or xlab("xyz") to just change name
p
p + scale_color_continuous("",
breaks = c(1976, 1992, 2008),
labels = c("'76", "'92", "'08"),
low = "blue", high = "red")
RColorBrewer::display.brewer.all() to see palettes)p + scale_color_distiller("", # _distiller for continuous, _brewer for discrete
palette = "Spectral",
breaks = c(1976, 1992, 2008),
labels = c("'76", "'92", "'08"))
data = mtcars)aes())geom_*())stat_*())scale_*())coord_*())facet_*())theme_*())facet_wrap(): wraparound series of plots on one (or more) variables
ggplot(housing, aes(Date, Home.Price.Index)) +
geom_line() +
facet_wrap(~State)
facet_wrap(): wraparound series of plots on one (or more) variables
ggplot(housing, aes(Date, Home.Price.Index)) +
geom_line() +
facet_wrap(~region + State)
facet_grid(): 2d grid of plots for two (or more) variables
labeller argument for more info on facet labelsggplot(mtcars, aes(wt, mpg)) +
geom_point() +
facet_grid(am~cyl, labeller = "label_both")
facet_grid(): 2d grid of plots for two (or more) variables
scales argument for free/fixed scale limitsggplot(mtcars, aes(wt, mpg)) +
geom_point() +
facet_grid(am~cyl, labeller = "label_both", scales = "free_x")
data = mtcars)aes())geom_*())stat_*())scale_*())coord_*())facet_*())theme_*())theme_bw()
theme_classic()
theme_dark()
ggplot(mtcars, aes(wt, mpg, color = factor(cyl))) +
geom_point() +
theme_minimal()
tidyverse
region can be found in housingnuforc <- read_csv("data/nuforc_events.csv")
# Get regions from housing data (should drop Canada b/c of inner_join)
housing %>% select(State, region) %>% unique %>% inner_join(nuforc) %>%
filter(!is.na(Year), !is.na(Shape), !is.na(region), Year > 1938) %>% # Clean
# Collapse rare shapes
group_by(Shape) %>% mutate(Shape_Disp = ifelse(n() > 5000, Shape, "Other")) %>%
group_by(Year, Shape_Disp, region) %>% tally %>% # tally = summarize(n = n())
ggplot(aes(Year, n, color = Shape_Disp)) +
geom_smooth(se = F) +
scale_color_brewer("UFO Shape", palette = "Set1") +
facet_wrap(~region, scales = "free_y") +
theme_minimal() +
ylab("Number of sightings")
#gds_eurovision_ussr on Slack!library(readxl)
former_ssrs <- c("Armenia", "Azerbaijan", "Belarus", "Estonia", "Georgia",
"Latvia", "Lithuania", "Moldova", "Poland", "Russia", "Ukraine")
eurovision_finals_ussr <-
read_excel("data/eurovision_song_contest_1975_2016.xlsx") %>%
filter(competition_round == "f", from_country != to_country) %>%
mutate(from_affiliation = ifelse(from_country %in% former_ssrs, "USSR", "Other"),
to_affiliation = ifelse(to_country %in% former_ssrs, "USSR", "Other"))
eurovision_finals_ussr %>%
mutate(to_affiliation = factor(to_affiliation, levels = c("USSR", "Other"))) %>% # Change the order so USSR is red (of course!)
ggplot(aes(year, points, color = to_affiliation)) +
geom_vline(xintercept = 1993.5) +
geom_jitter(alpha = 0.2, width = 0) +
geom_smooth() +
facet_wrap(~from_affiliation, labeller = "label_both") +
scale_color_brewer(palette = "Set1") +
theme_bw()
eurovision_finals_ussr %>%
filter(from_affiliation == "USSR") %>%
ggplot(aes(year, points, color = to_affiliation)) +
geom_jitter(alpha = 0.5) +
geom_smooth() +
scale_color_brewer(palette = "Dark2") +
facet_wrap(~from_country, labeller = "label_both") +
theme_bw()
ggplot and modeling
modelr - some nice functions like add_residuals(model, df)broom - extract key metrics from a model: tidy for coefficients, glance for a one-row summary with r2, AIC, etc.purrr - map function calls onto list-columns in tibbles (example later)library(randomForest)
rf <- randomForest(mpg ~ ., data = mtcars)
rf
Call:
randomForest(formula = mpg ~ ., data = mtcars)
Type of random forest: regression
Number of trees: 500
No. of variables tried at each split: 3
Mean of squared residuals: 5.547114
% Var explained: 84.24
tidyr::nest
dplyr syntaxmtcars %>% group_by(cyl) %>% nest
# A tibble: 3 × 2
cyl data
<dbl> <list>
1 6 <tibble [7 × 10]>
2 4 <tibble [11 × 10]>
3 8 <tibble [14 × 10]>
mtcars %>% nest(-cyl)
# A tibble: 3 × 2
cyl data
<dbl> <list>
1 6 <tibble [7 × 10]>
2 4 <tibble [11 × 10]>
3 8 <tibble [14 × 10]>
purrr::mapmtcars %>% nest(-cyl) %>%
mutate(mpg_rf = map(data, function(d) randomForest(mpg ~ ., data = d)))
# A tibble: 3 × 3
cyl data mpg_rf
<dbl> <list> <list>
1 6 <tibble [7 × 10]> <S3: randomForest.formula>
2 4 <tibble [11 × 10]> <S3: randomForest.formula>
3 8 <tibble [14 × 10]> <S3: randomForest.formula>
library(modelr)
mtcars %>% nest(-cyl) %>%
mutate(mpg_rf = map(data, function(d) randomForest(mpg ~ ., data = d)),
mpg_preds = map2(data, mpg_rf, add_predictions)) # map2 is for 2 arg functions
# A tibble: 3 × 4
cyl data mpg_rf mpg_preds
<dbl> <list> <list> <list>
1 6 <tibble [7 × 10]> <S3: randomForest.formula> <tibble [7 × 11]>
2 4 <tibble [11 × 10]> <S3: randomForest.formula> <tibble [11 × 11]>
3 8 <tibble [14 × 10]> <S3: randomForest.formula> <tibble [14 × 11]>
unnestmtcars %>% nest(-cyl) %>%
mutate(mpg_rf = map(data, function(d) randomForest(mpg ~ ., data = d)),
mpg_preds = map2(data, mpg_rf, add_predictions)) %>%
unnest(mpg_preds)
# A tibble: 32 × 12
cyl mpg disp hp drat wt qsec vs am gear carb
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6 21.0 160.0 110 3.90 2.620 16.46 0 1 4 4
2 6 21.0 160.0 110 3.90 2.875 17.02 0 1 4 4
3 6 21.4 258.0 110 3.08 3.215 19.44 1 0 3 1
4 6 18.1 225.0 105 2.76 3.460 20.22 1 0 3 1
5 6 19.2 167.6 123 3.92 3.440 18.30 1 0 4 4
6 6 17.8 167.6 123 3.92 3.440 18.90 1 0 4 4
7 6 19.7 145.0 175 3.62 2.770 15.50 0 1 5 6
8 4 22.8 108.0 93 3.85 2.320 18.61 1 1 4 1
9 4 24.4 146.7 62 3.69 3.190 20.00 1 0 4 2
10 4 22.8 140.8 95 3.92 3.150 22.90 1 0 4 2
# ... with 22 more rows, and 1 more variables: pred <dbl>
mtcars %>% nest(-cyl) %>%
mutate(mpg_rf = map(data, function(d) randomForest(mpg ~ ., data = d)),
mpg_preds = map2(data, mpg_rf, add_predictions)) %>%
unnest(mpg_preds) %>%
ggplot(aes(mpg, pred, color = wt)) +
geom_abline() +
geom_point() +
scale_color_distiller(palette = "Spectral") +
facet_wrap(~cyl, scales = "free") +
theme_dark()
models_to_try <- list(function(d) lm(mpg ~ wt + disp, data = d),
function(d) lm(mpg ~ wt + disp + hp, data = d),
function(d) lm(mpg ~ wt * disp * hp, data = d))
lapply(models_to_try, function(modf) {
mtcars %>% nest(-cyl) %>%
mutate(mod = map(data, modf), gl = map(mod, broom::glance),
f = map_chr(mod, function(m) as.character(formula(m))[c(2,1,3)] %>%
paste(collapse = " "))) %>%
unnest(gl) }) %>% bind_rows %>%
ggplot(aes(f, BIC, fill = f)) + geom_bar(stat = "identity") +
scale_x_discrete("", breaks = NULL) +
scale_fill_brewer("lm formula", palette = "Dark2") +
facet_wrap(~cyl, labeller = "label_both") + theme_bw()
dplyr, readr, ggplot2dplyr, dplyr databases, broom